In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
In [2]:
#read data
df=pd.read_csv(r"C:\Users\sunil\Downloads\data.csv")
df
Out[2]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 NaN
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 NaN
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 NaN
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 NaN
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 NaN

569 rows × 33 columns

In [3]:
df.columns
Out[3]:
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
In [4]:
#data overview
df.head()
Out[4]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns

In [5]:
#drop the unnamed and id columns.
#useless
df=df.drop(columns=['Unnamed: 32','id'])
In [6]:
#no of rows and columns
df.shape
#there is 569 rows and 31columns i.e 30 features and one target class 
Out[6]:
(569, 31)
In [7]:
#lets check data types
df.dtypes
#all are numeric except target label 'diagnosis'
Out[7]:
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object
In [8]:
df.describe()
Out[8]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 30 columns

In [9]:
#check any null values in database
df.isnull().values.any()
Out[9]:
False
In [10]:
#lets count class labels
df['diagnosis'].value_counts()
Out[10]:
B    357
M    212
Name: diagnosis, dtype: int64
In [11]:
#Data visualization
#histogram
df.hist(bins=50,figsize=(15,15))
plt.show()
In [12]:
df.isna().sum()
Out[12]:
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
In [13]:
#Scatter matrix to check correlation between two attributes
sns.pairplot(df)
plt.show()
In [14]:
#Count each label 
ax=sns.countplot(y='diagnosis',data=df,palette='Set1')
In [15]:
#Get a count of the number of 'M' & 'B' cells
df['diagnosis'].value_counts()
Out[15]:
B    357
M    212
Name: diagnosis, dtype: int64
In [16]:
#Visualize this count 
sns.countplot(df['diagnosis'],label="Count")
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c9760ece80>
In [17]:
#lets find correlation
plt.figure(figsize=(20,20))  
sns.heatmap(df.corr(), annot=True, fmt='.0%', cmap='Blues')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c9765f7198>
In [18]:
#box plot to check outlier in each category

#define function can be call later 
def boxPlot(dff):
    d=dff.drop(columns=['diagnosis'])
    for column in d:
        plt.figure(figsize=(5,2))
        sns.boxplot(x=column,data=d,palette="colorblind")
boxPlot(df)
In [19]:
Q1=df.quantile(0.25)
Q3=df.quantile(0.75)
IQR=Q3-Q1

##---quartiles and IQR

print("Quartile 1:\n",Q1)
print("\nQuartile 3:\n",Q3)
print("\nIQR :\n",IQR)

#--display outlier
print((df<(Q1-1.5*IQR))|(df>(Q3+1.5*IQR)))
Quartile 1:
 radius_mean                 11.700000
texture_mean                16.170000
perimeter_mean              75.170000
area_mean                  420.300000
smoothness_mean              0.086370
compactness_mean             0.064920
concavity_mean               0.029560
concave points_mean          0.020310
symmetry_mean                0.161900
fractal_dimension_mean       0.057700
radius_se                    0.232400
texture_se                   0.833900
perimeter_se                 1.606000
area_se                     17.850000
smoothness_se                0.005169
compactness_se               0.013080
concavity_se                 0.015090
concave points_se            0.007638
symmetry_se                  0.015160
fractal_dimension_se         0.002248
radius_worst                13.010000
texture_worst               21.080000
perimeter_worst             84.110000
area_worst                 515.300000
smoothness_worst             0.116600
compactness_worst            0.147200
concavity_worst              0.114500
concave points_worst         0.064930
symmetry_worst               0.250400
fractal_dimension_worst      0.071460
Name: 0.25, dtype: float64

Quartile 3:
 radius_mean                  15.780000
texture_mean                 21.800000
perimeter_mean              104.100000
area_mean                   782.700000
smoothness_mean               0.105300
compactness_mean              0.130400
concavity_mean                0.130700
concave points_mean           0.074000
symmetry_mean                 0.195700
fractal_dimension_mean        0.066120
radius_se                     0.478900
texture_se                    1.474000
perimeter_se                  3.357000
area_se                      45.190000
smoothness_se                 0.008146
compactness_se                0.032450
concavity_se                  0.042050
concave points_se             0.014710
symmetry_se                   0.023480
fractal_dimension_se          0.004558
radius_worst                 18.790000
texture_worst                29.720000
perimeter_worst             125.400000
area_worst                 1084.000000
smoothness_worst              0.146000
compactness_worst             0.339100
concavity_worst               0.382900
concave points_worst          0.161400
symmetry_worst                0.317900
fractal_dimension_worst       0.092080
Name: 0.75, dtype: float64

IQR :
 radius_mean                  4.080000
texture_mean                 5.630000
perimeter_mean              28.930000
area_mean                  362.400000
smoothness_mean              0.018930
compactness_mean             0.065480
concavity_mean               0.101140
concave points_mean          0.053690
symmetry_mean                0.033800
fractal_dimension_mean       0.008420
radius_se                    0.246500
texture_se                   0.640100
perimeter_se                 1.751000
area_se                     27.340000
smoothness_se                0.002977
compactness_se               0.019370
concavity_se                 0.026960
concave points_se            0.007072
symmetry_se                  0.008320
fractal_dimension_se         0.002310
radius_worst                 5.780000
texture_worst                8.640000
perimeter_worst             41.290000
area_worst                 568.700000
smoothness_worst             0.029400
compactness_worst            0.191900
concavity_worst              0.268400
concave points_worst         0.096470
symmetry_worst               0.067500
fractal_dimension_worst      0.020620
dtype: float64
     area_mean  area_se  area_worst  compactness_mean  compactness_se  \
0        False     True        True              True           False   
1        False    False        True             False           False   
2        False     True       False             False           False   
3        False    False       False              True            True   
4        False     True       False             False           False   
..         ...      ...         ...               ...             ...   
564       True     True        True             False           False   
565      False     True       False             False           False   
566      False    False       False             False           False   
567      False     True       False              True            True   
568      False    False       False             False           False   

     compactness_worst  concave points_mean  concave points_se  \
0                 True                False              False   
1                False                False              False   
2                False                False              False   
3                 True                False              False   
4                False                False              False   
..                 ...                  ...                ...   
564              False                False              False   
565              False                False              False   
566              False                False              False   
567               True                False              False   
568              False                False              False   

     concave points_worst  concavity_mean  ...  radius_worst  smoothness_mean  \
0                   False            True  ...         False            False   
1                   False           False  ...         False            False   
2                   False           False  ...         False            False   
3                   False           False  ...         False             True   
4                   False           False  ...         False            False   
..                    ...             ...  ...           ...              ...   
564                 False           False  ...         False            False   
565                 False           False  ...         False            False   
566                 False           False  ...         False            False   
567                 False            True  ...         False            False   
568                 False           False  ...         False             True   

     smoothness_se  smoothness_worst  symmetry_mean  symmetry_se  \
0            False             False          False        False   
1            False             False          False        False   
2            False             False          False        False   
3            False              True           True         True   
4            False             False          False        False   
..             ...               ...            ...          ...   
564          False             False          False        False   
565          False             False          False        False   
566          False             False          False        False   
567          False             False          False        False   
568          False             False          False        False   

     symmetry_worst  texture_mean  texture_se  texture_worst  
0              True         False       False          False  
1             False         False       False          False  
2             False         False       False          False  
3              True         False       False          False  
4             False         False       False          False  
..              ...           ...         ...            ...  
564           False         False       False          False  
565           False         False        True          False  
566           False         False       False          False  
567           False         False       False          False  
568           False         False       False          False  

[569 rows x 31 columns]
In [20]:
#remove all outlier
# < Q1-1.5*IQR
# > Q3+1.5*IQR

df_out = df[~((df < (Q1 - (1.5 * IQR))) |(df > (Q3 + (1.5 * IQR)))).any(axis=1)]
df.shape,df_out.shape
Out[20]:
((569, 31), (398, 31))
In [21]:
#good to go..
#Lets seprate labels and features

X=df_out.drop(columns=['diagnosis'])
y=df_out['diagnosis']
y
Out[21]:
6      M
7      M
10     M
11     M
13     M
      ..
554    B
555    B
558    B
560    B
566    M
Name: diagnosis, Length: 398, dtype: object
In [22]:
#--visualize again boxplot
boxPlot(df_out)
In [23]:
#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y
Out[23]:
array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1])
In [24]:
# Splitting the dataset into the Training set and Test setfrom sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
In [25]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
In [26]:
#Fitting the Logistic Regression Algorithm to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
#Accuracy
print(f"Logistic Regression training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Logistic Regression testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
#predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred 
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
Logistic Regression training set classification score: 0.9899 
Logistic Regression testing set classification score: 0.9700 
Out[26]:
array([[73,  0],
       [ 3, 24]], dtype=int64)
In [27]:
#Fitting K-NN Algorithm
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
#Accuracy
print(f"KNN training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"KNN Regression testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
      
#predicting the Test set results
y_pred = classifier.predict(X_test)

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
KNN training set classification score: 0.9664 
KNN Regression testing set classification score: 0.9600 
Out[27]:
array([[72,  1],
       [ 3, 24]], dtype=int64)
In [28]:
#Fitting SVM
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train) 
#Accuracy
print(f"SVM training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"SVM testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")

#predicting the Test set results
y_pred = classifier.predict(X_test)

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
SVM training set classification score: 0.9899 
SVM testing set classification score: 0.9500 
Out[28]:
array([[71,  2],
       [ 3, 24]], dtype=int64)
In [29]:
#Fitting K-SVM
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
#Accuracy
print(f"K-SVM training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"K-SVM testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
#predicting the Test set results
y_pred = classifier.predict(X_test)

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
K-SVM training set classification score: 0.9866 
K-SVM testing set classification score: 0.9600 
Out[29]:
array([[72,  1],
       [ 3, 24]], dtype=int64)
In [30]:
#Fitting Naive_Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
#Accuracy
print(f"Naive Bayes training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Naive Bayes testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
#predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred 
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
Naive Bayes training set classification score: 0.9329 
Naive Bayes testing set classification score: 0.9600 
Out[30]:
array([[72,  1],
       [ 3, 24]], dtype=int64)
In [31]:
#Fitting Decision Tree Algorithm
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
#Accuracy
print(f"Decision Tree training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Decision Tree testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")

#predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred 
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
Decision Tree training set classification score: 1.0000 
Decision Tree testing set classification score: 0.9200 
Out[31]:
array([[72,  1],
       [ 7, 20]], dtype=int64)
In [32]:
#Fitting Random Forest Classification Algorithm
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
#Accuracy
print(f"Random Forest training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Random Forest testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")

#predicting the Test set results
y_pred = classifier.predict(X_test)

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm = confusion_matrix(y_test, y_pred)
cm
Random Forest training set classification score: 0.9832 
Random Forest testing set classification score: 0.9400 
Out[32]:
array([[73,  0],
       [ 6, 21]], dtype=int64)
In [33]:
label = ['Logistic Regression','K-NN','SVM','K-SVM','Naive Bayes','Decision Tree','Random Forest']
accuracy=[0.97,0.96,0.95,0.96,0.95,0.92,0.94]
# this is for plotting purpose
index = np.arange(len(label))
plt.bar(index, accuracy,color=['k', 'r', 'g', 'b', 'c', 'y', 'm'])
plt.xlabel('Algorithms', fontsize=10)
plt.ylabel('Accuracy', fontsize=10)
plt.xticks(index, label, fontsize=10, rotation=90)
plt.title('Best suited classification model')
plt.show()
In [ ]: